# Using readr::read_csv() instead of data.table::fread() because fread has
# some trouble handling NA values
library(readr)
# Load the LinkedIn job postings data set (123,849 rows x 31 columns)
posting <- read_csv("postings.csv")
## Rows: 123849 Columns: 31
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (17): company_name, title, description, pay_period, location, formatted_...
## dbl (14): job_id, max_salary, company_id, views, med_salary, min_salary, app...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(posting)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

Changes:

Include original_listed_time, expiry, and closed_time. All times are in UNIX time in milliseconds.

# Keep only the columns needed for this analysis
posting_clean <- posting %>% 
  select(
    job_id, company_name, title,
    max_salary, med_salary, min_salary, pay_period,
    location, company_id, formatted_work_type,
    original_listed_time, expiry, closed_time,
    applies, remote_allowed, application_type,
    formatted_experience_level, sponsored, work_type,
    currency, zip_code, fips
  )

Step 1

First, we will see if there are any completely duplicated rows, to check whether the data retrieval process introduced any duplicates. Note that we will also include job_id when checking for completely duplicated rows, because if the same job has two different ids, that job was posted twice. In that case, we will check how often the same jobs are posted more than once, and we will decide what to do with them accordingly.

#View(posting_clean)
# Flag every row that appears more than once: checking from both directions
# catches all copies of a duplicate, not just the second and later ones.
dup_mask <- duplicated(posting_clean) | duplicated(posting_clean, fromLast = TRUE)
duplicated_posting_clean <- posting_clean[dup_mask, ]
duplicated_posting_clean
## # A tibble: 0 × 22
## # ℹ 22 variables: job_id <dbl>, company_name <chr>, title <chr>,
## #   max_salary <dbl>, med_salary <dbl>, min_salary <dbl>, pay_period <chr>,
## #   location <chr>, company_id <dbl>, formatted_work_type <chr>,
## #   original_listed_time <dbl>, expiry <dbl>, closed_time <dbl>, applies <dbl>,
## #   remote_allowed <dbl>, application_type <chr>,
## #   formatted_experience_level <chr>, sponsored <dbl>, work_type <chr>,
## #   currency <chr>, zip_code <chr>, fips <chr>
# Re-check for duplicates with the id and timestamp columns removed
posting_clean_no_id <- posting_clean %>% 
  select(-job_id, -company_id, -original_listed_time, -expiry, -closed_time)
#View(posting_clean_no_id)
# Same both-directions duplicate flag as before, on the reduced table
dup_mask_no_id <- duplicated(posting_clean_no_id) |
  duplicated(posting_clean_no_id, fromLast = TRUE)
duplicated_posting_clean_no_id <- posting_clean_no_id[dup_mask_no_id, ]
duplicated_posting_clean_no_id
## # A tibble: 10,045 × 17
##    company_name  title      max_salary med_salary min_salary pay_period location
##    <chr>         <chr>           <dbl>      <dbl>      <dbl> <chr>      <chr>   
##  1 City of Tempe Deputy Ci…   274206           NA   205398   YEARLY     Tempe, …
##  2 SKF Group     Technician       NA           NA       NA   <NA>       Plymout…
##  3 SKF Group     Heat Trea…       27.9         NA       19.6 HOURLY     Muskego…
##  4 SKF Group     Quality E…       NA           NA       NA   <NA>       Lansdal…
##  5 SKF Group     Maintenan…       NA           NA       NA   <NA>       Sumter,…
##  6 SKF Group     Manufactu…       NA           NA       NA   <NA>       Dexter,…
##  7 SKF Group     Regional …       NA           NA       NA   <NA>       Cincinn…
##  8 SKF Group     Machining…       NA           NA       NA   <NA>       Muskego…
##  9 SKF Group     CNC Machi…       NA           NA       NA   <NA>       Hanover…
## 10 SKF Group     Maintenan…       NA           NA       NA   <NA>       Ladson,…
## # ℹ 10,035 more rows
## # ℹ 10 more variables: formatted_work_type <chr>, applies <dbl>,
## #   remote_allowed <dbl>, application_type <chr>,
## #   formatted_experience_level <chr>, sponsored <dbl>, work_type <chr>,
## #   currency <chr>, zip_code <chr>, fips <chr>

Since there were no duplicates with job_id included, and upon inspection of the duplicated rows without the id columns, it is safe to assume that these rows are separate job postings: either companies posting more than once for the same position, or reposts made after changes to the original posting. Because of this, we will keep all of the duplicated rows without ids, since they are not really duplicates. Now, we will go on to inspect the data more closely.

summary(posting_clean)
##      job_id          company_name          title             max_salary       
##  Min.   :9.217e+05   Length:123849      Length:123849      Min.   :        1  
##  1st Qu.:3.895e+09   Class :character   Class :character   1st Qu.:       48  
##  Median :3.902e+09   Mode  :character   Mode  :character   Median :    80000  
##  Mean   :3.896e+09                                         Mean   :    91939  
##  3rd Qu.:3.905e+09                                         3rd Qu.:   140000  
##  Max.   :3.906e+09                                         Max.   :120000000  
##                                                            NA's   :94056      
##    med_salary         min_salary        pay_period          location        
##  Min.   :     0.0   Min.   :       1   Length:123849      Length:123849     
##  1st Qu.:    18.9   1st Qu.:      37   Class :character   Class :character  
##  Median :    25.5   Median :   60000   Mode  :character   Mode  :character  
##  Mean   : 22015.6   Mean   :   64911                                        
##  3rd Qu.:  2510.5   3rd Qu.:  100000                                        
##  Max.   :750000.0   Max.   :85000000                                        
##  NA's   :117569     NA's   :94056                                           
##    company_id        formatted_work_type original_listed_time
##  Min.   :     1009   Length:123849       Min.   :1.702e+12   
##  1st Qu.:    14352   Class :character    1st Qu.:1.713e+12   
##  Median :   226965   Mode  :character    Median :1.713e+12   
##  Mean   : 12204012                       Mean   :1.713e+12   
##  3rd Qu.:  8047188                       3rd Qu.:1.713e+12   
##  Max.   :103472979                       Max.   :1.714e+12   
##  NA's   :1717                                                
##      expiry           closed_time           applies       remote_allowed  
##  Min.   :1.713e+12   Min.   :1.712e+12   Min.   :  1.00   Min.   :1       
##  1st Qu.:1.715e+12   1st Qu.:1.713e+12   1st Qu.:  1.00   1st Qu.:1       
##  Median :1.716e+12   Median :1.713e+12   Median :  3.00   Median :1       
##  Mean   :1.716e+12   Mean   :1.713e+12   Mean   : 10.59   Mean   :1       
##  3rd Qu.:1.716e+12   3rd Qu.:1.713e+12   3rd Qu.:  8.00   3rd Qu.:1       
##  Max.   :1.729e+12   Max.   :1.714e+12   Max.   :967.00   Max.   :1       
##                      NA's   :122776      NA's   :100529   NA's   :108603  
##  application_type   formatted_experience_level   sponsored  work_type        
##  Length:123849      Length:123849              Min.   :0   Length:123849     
##  Class :character   Class :character           1st Qu.:0   Class :character  
##  Mode  :character   Mode  :character           Median :0   Mode  :character  
##                                                Mean   :0                     
##                                                3rd Qu.:0                     
##                                                Max.   :0                     
##                                                                              
##    currency           zip_code             fips          
##  Length:123849      Length:123849      Length:123849     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
## 
str(posting_clean)
## tibble [123,849 × 22] (S3: tbl_df/tbl/data.frame)
##  $ job_id                    : num [1:123849] 921716 1829192 10998357 23221523 35982263 ...
##  $ company_name              : chr [1:123849] "Corcoran Sawyer Smith" NA "The National Exemplar" "Abrams Fensterman, LLP" ...
##  $ title                     : chr [1:123849] "Marketing Coordinator" "Mental Health Therapist/Counselor" "Assitant Restaurant Manager" "Senior Elder Law / Trusts and Estates Associate Attorney" ...
##  $ max_salary                : num [1:123849] 20 50 65000 175000 80000 20 300000 120000 NA NA ...
##  $ med_salary                : num [1:123849] NA NA NA NA NA NA NA NA NA 350 ...
##  $ min_salary                : num [1:123849] 17 30 45000 140000 60000 14 60000 90000 NA NA ...
##  $ pay_period                : chr [1:123849] "HOURLY" "HOURLY" "YEARLY" "YEARLY" ...
##  $ location                  : chr [1:123849] "Princeton, NJ" "Fort Collins, CO" "Cincinnati, OH" "New Hyde Park, NY" ...
##  $ company_id                : num [1:123849] 2774458 NA 64896719 766262 NA ...
##  $ formatted_work_type       : chr [1:123849] "Full-time" "Full-time" "Full-time" "Full-time" ...
##  $ original_listed_time      : num [1:123849] 1.71e+12 1.71e+12 1.71e+12 1.71e+12 1.71e+12 ...
##  $ expiry                    : num [1:123849] 1.72e+12 1.72e+12 1.72e+12 1.72e+12 1.72e+12 ...
##  $ closed_time               : num [1:123849] NA NA NA NA NA NA NA NA NA NA ...
##  $ applies                   : num [1:123849] 2 NA NA NA NA 4 1 NA NA NA ...
##  $ remote_allowed            : num [1:123849] NA NA NA NA NA NA 1 NA NA NA ...
##  $ application_type          : chr [1:123849] "ComplexOnsiteApply" "ComplexOnsiteApply" "ComplexOnsiteApply" "ComplexOnsiteApply" ...
##  $ formatted_experience_level: chr [1:123849] NA NA NA NA ...
##  $ sponsored                 : num [1:123849] 0 0 0 0 0 0 0 0 0 0 ...
##  $ work_type                 : chr [1:123849] "FULL_TIME" "FULL_TIME" "FULL_TIME" "FULL_TIME" ...
##  $ currency                  : chr [1:123849] "USD" "USD" "USD" "USD" ...
##  $ zip_code                  : chr [1:123849] "08540" "80521" "45202" "11040" ...
##  $ fips                      : chr [1:123849] "34021" "08069" "39061" "36059" ...

Formatting to a universal currency: USD. Since we will need to format other columns such as max/med/min salary and pay period, we need to tackle the currency column first. We know that there are multiple currencies. First, we will factor currency to find out how many levels it has and its distribution. Since this is a data set of LinkedIn job postings in the United States, we can safely assume that if the currency is not specified, it is USD. To support this theory, we can plot the distribution of currencies and observe that NA and USD dominate. We will create a subset of the data where the currency is neither USD nor NA to see how many observations of other currencies we have.

# Encode currency as a factor and list its distinct levels (NA's do not
# become a level, so they are invisible here)
posting_clean$currency <- factor(posting_clean$currency)
levels(posting_clean$currency)
## [1] "AUD" "BBD" "CAD" "EUR" "GBP" "USD"

Factoring currency

# Using a log scale for y to visualize the rare currencies.
# after_stat(count) replaces the ..count.. notation deprecated in ggplot2 3.4.0.
ggplot(posting_clean, aes(x = currency, y = after_stat(count))) +
  geom_bar() +
  scale_y_log10(labels = scales::comma)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Filtering the data for odd (non-USD) currencies.
# Use bare column names inside the pipe: posting_clean$currency would keep
# referring to the unfiltered table and silently misalign if an earlier step
# dropped rows. `!=` replaces the roundabout `!currency == "USD"`.
posting_clean_odd_currency <- posting_clean %>% 
  filter(!is.na(currency)) %>% 
  filter(currency != "USD")
# Making a bar chart of the distribution of odd currencies
# (after_stat(count) replaces the deprecated ..count.. notation)
ggplot(posting_clean_odd_currency, aes(x = currency, y = after_stat(count))) +
  geom_bar()

Since there are only a handful of odd currencies, we can safely assume that these are outliers, so the default currency is USD. Now, we will repopulate the currency column with USD in place of the NA's.

# Replace NA currency with the default "USD" in a single step.
# coalesce() keeps non-NA values and fills NA's with "USD", which avoids the
# previous factor -> character -> ifelse -> factor round trip; re-factoring
# afterwards yields the same sorted level set as before.
posting_clean <- posting_clean %>% 
  mutate(currency = factor(coalesce(as.character(currency), "USD")))
# Using a log scale for y to visualize the rare currencies
# (after_stat(count) replaces the deprecated ..count.. notation)
ggplot(posting_clean, aes(x = currency, y = after_stat(count))) +
  geom_bar() +
  scale_y_log10(labels = scales::comma)

Step 2:

Now, we will normalize the other columns that are affected by currency, like salary and pay rate. First, we have to figure out how many types of pay period there are.

# Distribution of pay periods on a log y scale.
# after_stat(count) replaces the ..count.. notation deprecated in ggplot2 3.4.0.
ggplot(posting_clean, aes(x = pay_period, y = after_stat(count))) +
  geom_bar() +
  scale_y_log10(labels = scales::comma) 

Since NA pay takes quite a large proportion, we can assume that jobs that don’t have a specified pay period don’t include min/med/max salaries. We check this

# Subset of postings with no pay period specified
posting_clean_pay_null <- filter(posting_clean, is.na(pay_period))
posting_clean_pay_null
## # A tibble: 87,776 × 22
##       job_id company_name      title max_salary med_salary min_salary pay_period
##        <dbl> <chr>             <chr>      <dbl>      <dbl>      <dbl> <chr>     
##  1   1218575 Children's Nebra… Resp…         NA         NA         NA <NA>      
##  2   9615617 Glastender, Inc.  Insi…         NA         NA         NA <NA>      
##  3  11009123 PGAV Destinations Proj…         NA         NA         NA <NA>      
##  4  56482768 <NA>              Appa…         NA         NA         NA <NA>      
##  5  56924323 <NA>              Stru…         NA         NA         NA <NA>      
##  6  69333422 Staffing Theory   Seni…         NA         NA         NA <NA>      
##  7  83789755 OsteoStrong       Oste…         NA         NA         NA <NA>      
##  8 111513530 United Methodist… Cont…         NA         NA         NA <NA>      
##  9 115639136 Shannon Waltchack Cont…         NA         NA         NA <NA>      
## 10 117675818 Premier Family C… Phys…         NA         NA         NA <NA>      
## # ℹ 87,766 more rows
## # ℹ 15 more variables: location <chr>, company_id <dbl>,
## #   formatted_work_type <chr>, original_listed_time <dbl>, expiry <dbl>,
## #   closed_time <dbl>, applies <dbl>, remote_allowed <dbl>,
## #   application_type <chr>, formatted_experience_level <chr>, sponsored <dbl>,
## #   work_type <chr>, currency <fct>, zip_code <chr>, fips <chr>
posting_clean_pay_null %>% filter(!is.na(max_salary) | !is.na(min_salary) | !is.na(med_salary))
## # A tibble: 0 × 22
## # ℹ 22 variables: job_id <dbl>, company_name <chr>, title <chr>,
## #   max_salary <dbl>, med_salary <dbl>, min_salary <dbl>, pay_period <chr>,
## #   location <chr>, company_id <dbl>, formatted_work_type <chr>,
## #   original_listed_time <dbl>, expiry <dbl>, closed_time <dbl>, applies <dbl>,
## #   remote_allowed <dbl>, application_type <chr>,
## #   formatted_experience_level <chr>, sponsored <dbl>, work_type <chr>,
## #   currency <fct>, zip_code <chr>, fips <chr>

There are no instances where an expected salary (max/med/min) is mentioned without a pay period, so we can rest assured. Now, we need to convert expected salary to a normalized format. We will choose yearly as the normalized format, since it is the easiest way to visualize how much a job pays per year, and it is easy to convert from yearly to any other format.

# Pay periods per year for annualizing a salary. Indexing this named vector
# with a missing or unrecognized pay_period yields NA, matching the previous
# case_when fall-through behavior.
periods_per_year <- c(
  HOURLY   = 40 * 52,  # assumes a 40-hour work week
  WEEKLY   = 52,
  BIWEEKLY = 26,
  MONTHLY  = 12,
  YEARLY   = 1
)

# Convert a salary to a yearly figure based on its pay period.
# Returns NA when either the salary or the pay period is missing/unknown.
to_yearly <- function(salary, pay_period) {
  salary * unname(periods_per_year[pay_period])
}

# One mutate replaces the three copy-pasted case_when blocks; the conversion
# logic now lives in a single place.
posting_clean <- posting_clean %>% 
  mutate(
    norm_max_salary = to_yearly(max_salary, pay_period),
    norm_med_salary = to_yearly(med_salary, pay_period),
    norm_min_salary = to_yearly(min_salary, pay_period)
  )

Populating the norm_med_salary column based on max and min salary if NA

# Fill missing med salaries with the midpoint of max and min.
# coalesce(x, y) is the idiomatic NA-fallback, equivalent to the previous
# ifelse(is.na(x), y, x); still NA when max or min is also missing.
posting_clean <- posting_clean %>% 
  mutate(norm_med_salary = coalesce(norm_med_salary,
                                    (norm_max_salary + norm_min_salary) / 2))

Converting currency

# USD per unit of each currency (static rates used by the original analysis).
# Indexing with an unmatched currency yields NA, matching the previous
# case_when fall-through.
usd_per_unit <- c(AUD = 0.6184, BBD = 0.5, CAD = 0.682,
                  EUR = 1.1, GBP = 1.3, USD = 1)

# One across() replaces the three copy-pasted case_when blocks. currency is a
# factor, so convert to character before indexing the rate vector (a bare
# factor would index by its integer codes).
posting_clean <- posting_clean %>% 
  mutate(across(
    c(norm_med_salary, norm_max_salary, norm_min_salary),
    ~ .x * unname(usd_per_unit[as.character(currency)])
  ))

Converting original_listed_time, expiry, closed_time to date

# Convert the millisecond UNIX timestamps to POSIXct datetimes in UTC
posting_clean <- posting_clean %>% 
  mutate(across(
    c(original_listed_time, expiry, closed_time),
    ~ as.POSIXct(.x / 1000, origin = "1970-01-01", tz = "UTC")
  ))

Inspect amount of null values per col

colSums(is.na(posting_clean))
##                     job_id               company_name 
##                          0                       1719 
##                      title                 max_salary 
##                          0                      94056 
##                 med_salary                 min_salary 
##                     117569                      94056 
##                 pay_period                   location 
##                      87776                          0 
##                 company_id        formatted_work_type 
##                       1717                          0 
##       original_listed_time                     expiry 
##                          0                          0 
##                closed_time                    applies 
##                     122776                     100529 
##             remote_allowed           application_type 
##                     108603                          0 
## formatted_experience_level                  sponsored 
##                      29409                          0 
##                  work_type                   currency 
##                          0                          0 
##                   zip_code                       fips 
##                      20872                      27415 
##            norm_max_salary            norm_med_salary 
##                      94056                      87776 
##            norm_min_salary 
##                      94056

Percentage of null value per col

round(colSums(is.na(posting_clean)) / nrow(posting_clean) * 100, 2)
##                     job_id               company_name 
##                       0.00                       1.39 
##                      title                 max_salary 
##                       0.00                      75.94 
##                 med_salary                 min_salary 
##                      94.93                      75.94 
##                 pay_period                   location 
##                      70.87                       0.00 
##                 company_id        formatted_work_type 
##                       1.39                       0.00 
##       original_listed_time                     expiry 
##                       0.00                       0.00 
##                closed_time                    applies 
##                      99.13                      81.17 
##             remote_allowed           application_type 
##                      87.69                       0.00 
## formatted_experience_level                  sponsored 
##                      23.75                       0.00 
##                  work_type                   currency 
##                       0.00                       0.00 
##                   zip_code                       fips 
##                      16.85                      22.14 
##            norm_max_salary            norm_med_salary 
##                      75.94                      70.87 
##            norm_min_salary 
##                      75.94

We will blank out the location wherever both zip_code and fips are null; we will later repopulate the location column based on the zip code or fips code.

# Blank the location whenever both zip_code and fips are missing; those rows
# cannot be geolocated from the remaining columns
posting_clean <- posting_clean %>%  
  mutate(location = if_else(is.na(zip_code) & is.na(fips), NA_character_, location))

Separate location into city and state

# Split "City, ST" into city and state. sep = "," alone leaves a leading
# space on the state piece (e.g. " NJ"), so trim both parts afterwards.
posting_clean <- posting_clean %>% 
  separate(location, into = c("city", "state"), sep = ",") %>% 
  mutate(across(c(city, state), str_trim))

Drop old max/med/min salary cols, currency col

# Drop the raw salary columns (superseded by the norm_* columns in USD) and
# the currency column (everything is USD now)
posting_clean <- posting_clean %>% 
  select(-max_salary, -min_salary, -med_salary, -currency)

Relocate cols for easier cleaning

# Move the normalized salaries next to pay_period and the geo codes next to
# state, in one pipeline
posting_clean <- posting_clean %>% 
  relocate(norm_max_salary, norm_med_salary, norm_min_salary, .before = pay_period) %>% 
  relocate(zip_code, fips, .after = state)

Repopulating city and state based on zip_code and fips

Loading in the zip_code_database

# Reference table mapping zip codes to city/state/coordinates
zip_code_database <- read_csv("zip_code_database.csv")
## Rows: 42735 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): zip, type, primary_city, acceptable_cities, unacceptable_cities, s...
## dbl  (4): decommissioned, latitude, longitude, irs_estimated_population
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the lookup columns we need
zip_code_database_clean <- zip_code_database %>% 
  select(zip, primary_city, state, latitude, longitude)

Adding a suffix to zip_code_database_clean

# Suffix every column so the joined names cannot collide with posting_clean
zip_code_database_clean <- zip_code_database_clean %>% 
  rename_with(~ paste0(.x, ".zipcode"))

Left joining posting_clean with zip_code_database_clean

# Each posting should match at most one row of the zip database; declaring
# the expected relationship makes an accidental fan-out (duplicate zips in
# the reference table) error loudly instead of silently duplicating postings.
posting_clean <- posting_clean %>% 
  left_join(zip_code_database_clean, by = c("zip_code" = "zip.zipcode"),
            relationship = "many-to-one")

Populating city and state using zip_code. First, we need to find out how many observations have a null city or state while zip_code or fips is not null.

# Postings still missing a city or state after the location split
posting_clean_city_or_state_null <- posting_clean %>% 
  filter(is.na(city) | is.na(state))
nrow(posting_clean_city_or_state_null)
## [1] 20872
# Of those, how many have both a zip and a fips code to recover location from?
posting_zip_and_fips_not_null <- posting_clean_city_or_state_null %>%  
  filter(!is.na(zip_code) & !is.na(fips))
nrow(posting_zip_and_fips_not_null)
## [1] 0

There is no way to populate those columns: every instance where city or state is null is due to zip_code and fips both being null. We can remove the columns that we don't need.

# All four helper columns brought in from the zip database carry the
# ".zipcode" suffix, so drop them with a single selector
posting_clean <- posting_clean %>% 
  select(-ends_with(".zipcode"))

Let see the distribution of states:

# Horizontal bar chart of postings per state. fct_rev() reverses the
# alphabetical factor order so states read A-to-Z top-down after coord_flip()
# (same ordering the original reorder(..., desc(...)) produced).
posting_clean %>% 
  filter(!is.na(state)) %>% 
  ggplot(aes(x = fct_rev(factor(state)))) +
  geom_bar(width = 0.8) +
  coord_flip() +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 11)) +
  labs(x = "State", y = "Count")

We can confirm that there are no outliers in the state column, and the format of state is correct.

Reading the industries and job_industries tables into posting

# Industry lookup table: industry_id -> industry_name
industry <- read_csv("industries.csv")
## Rows: 422 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): industry_name
## dbl (1): industry_id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Bridge table: job_id -> industry_id (a job can belong to several industries)
job_industry <- read_csv("job_industries.csv")
## Rows: 164808 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): job_id, industry_id
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# A direct join here would duplicate postings (one row per industry per job),
# so it is deliberately left commented out
#posting_clean <- posting_clean %>% 
#  left_join(job_industry, by = "job_id")

Left joining will create duplicates on the posting_clean because 1 job_id can have more than 1 industry_id, so we will aggregate the job_industry by job_id So first, we will have to join job_industry with industry on industry_id, doing this will create a job_industry table with job_id, industry_id, and industry_name. After that, we can aggregate the industry_name grouped by job_id, this means a job can have multiple industries, and our posting_clean table will not create any duplicated posting.

# Attach industry names to the bridge table, then collapse to one
# comma-separated string per job_id so the later join to posting_clean
# cannot fan out posting rows
job_industry <- job_industry %>% 
  left_join(industry, by = "industry_id")
job_industry_agg <- job_industry %>% 
  group_by(job_id) %>% 
  summarise(industries_name = paste(industry_name, collapse = ", "))

Now, we can left join posting_clean with job_industry_agg and we won’t have any duplicated rows of job postings, while keeping all the industries_name for further analysis

# job_industry_agg has one row per job_id, so this join only adds the
# industries_name column without duplicating postings
posting_clean <- posting_clean %>% 
  left_join(job_industry_agg, by = "job_id") %>% 
  relocate(industries_name, .after = title)

Now, we want to categorize industries_name into smaller groups for further analysis, because as of right now, there are over 3,300+ unique industry combinations.

Attempt 1: Methods: We will try to find out the most common keywords across the industries, and then we can use those to categorize all the industries based on common keywords. Search for the top 50 keywords.

Attempt 2: Expand the searching parameters to 100 most common instead

Attempt 3: Increase the size to 200

library(tidytext)

# Tokenize industries_name, drop stop words, and keep the k most frequent
# keywords (ties included, as with top_n). Factoring this out replaces the
# three copy-pasted pipelines that differed only in k.
top_keywords <- function(df, k) {
  df %>% 
    unnest_tokens(word, industries_name) %>% 
    anti_join(stop_words, by = "word") %>%  # remove common stop words
    count(word, sort = TRUE) %>% 
    top_n(k, n)
}

top50_keywords <- top_keywords(posting_clean, 50)
#View(top50_keywords)

top100_keywords <- top_keywords(posting_clean, 100)
#View(top100_keywords)

top200_keywords <- top_keywords(posting_clean, 200)
#View(top200_keywords)
#temporary_test <- top200_keywords %>% 
#  select(word)
#temporary_test2 <- read_csv("industry_keywords_by_field.csv")
#temporary_test3 <- temporary_test %>% 
#  semi_join(temporary_test2, by = c("word" = "Keyword"))
#tempurary_test4 <- temporary_test3 %>%  
#  inner_join(temporary_test2, by = c("word" = "Keyword"))

Mapping out individual keywords

posting_clean[grepl("Defense and space", posting_clean$industries_name, ignore.case = TRUE), ]
## # A tibble: 1,446 × 23
##        job_id company_name title industries_name norm_max_salary norm_med_salary
##         <dbl> <chr>        <chr> <chr>                     <dbl>           <dbl>
##  1 3803052628 Insight Glo… Flig… Defense and Sp…          170000          155000
##  2 3815847830 Northrop Gr… Staf… Defense and Sp…          241400          201200
##  3 3850493836 Crown Point… Audi… Defense and Sp…              NA              NA
##  4 3884433891 Becker Wrig… Hill… Appliances, El…          125000          112500
##  5 3884436032 ITRS Recrui… Cost… Aviation and A…              NA              NA
##  6 3884436140 North Star … FPGA… Defense and Sp…              NA              NA
##  7 3884436258 AirBorn Inc. Seni… Defense and Sp…              NA              NA
##  8 3884437283 Pegasus Ste… Qual… Defense and Sp…              NA              NA
##  9 3884437415 JW Industri… Staf… Aviation and A…          160000          140000
## 10 3884438364 The Davis C… Exec… Defense and Sp…              NA              29
## # ℹ 1,436 more rows
## # ℹ 17 more variables: norm_min_salary <dbl>, pay_period <chr>, city <chr>,
## #   state <chr>, zip_code <chr>, fips <chr>, company_id <dbl>,
## #   formatted_work_type <chr>, original_listed_time <dttm>, expiry <dttm>,
## #   closed_time <dttm>, applies <dbl>, remote_allowed <dbl>,
## #   application_type <chr>, formatted_experience_level <chr>, sponsored <dbl>,
## #   work_type <chr>
#posting_clean[grepl("advertising", posting_clean$industries_name, ignore.case = TRUE), ]
#top200_keywords_test2 <- top200_keywords %>% 
#  left_join(tempurary_test4, by = "word")

Now, I want to see what the leading industries are. Let's convert the current table into long format and aggregate by industries_name.

# One row per (job, industry): split the comma-separated industry strings,
# then strip the stray whitespace the split leaves behind.
posting_clean_long <- posting_clean |>
  separate_rows(industries_name, sep = ",") |>
  mutate(industries_name = str_trim(industries_name))

Grouping by industries_name

# How many postings mention each industry, largest first
# (equivalent to count(industries_name, sort = TRUE))
posting_clean_long_agg <- posting_clean_long |>
  group_by(industries_name) |>
  summarise(n = n(), .groups = "drop") |>
  arrange(desc(n))
# Number of distinct industry labels present
nrow(posting_clean_long_agg)
## [1] 424

Since there are only about 400 industries, we don't have to group them into bigger sectors. Instead, let's look at the distribution of these industries.

# Histogram of postings-per-industry; log10 x axis because the counts
# span several orders of magnitude.
p_industry_dist <- ggplot(posting_clean_long_agg, aes(x = n)) +
  geom_histogram() +
  scale_x_log10(labels = scales::comma) +
  labs(x = "Distribution of amount of jobs per industries")
p_industry_dist
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Top 100 industries

# Top industries by number of postings (rows with NA industry excluded).
# geom_col() is the idiomatic shorthand for geom_bar(stat = "identity").
# FIX: after coord_flip() the industry names are drawn on the *vertical*
# axis, so the long labels must be shrunk via axis.text.y — the original
# targeted axis.text.x, which styles the numeric count axis instead.
posting_clean_long_agg %>% 
  filter(!is.na(industries_name)) %>% 
  slice_max(n, n = 100) %>% 
  ggplot(aes(x = reorder(industries_name, n), y = n)) + 
  geom_col() +
  coord_flip() +
  theme(axis.text.y = element_text(size = 5))

Let's see job demand from 2023–2024. First, let's create a month-year column for data aggregation.

Testing

No time series available, because we are missing a lot of data

# Collapse each listing timestamp to the first day of its month, keeping
# the derived column right next to the timestamp it came from.
posting_clean_long_yearmonth <- posting_clean_long |>
  mutate(yearmonth = as.Date(format(original_listed_time, "%Y-%m-01"))) |>
  relocate(yearmonth, .after = original_listed_time)
# The 100 most frequently mentioned industry labels
posting_clean_long |>
  count(industries_name, sort = TRUE) |>
  slice_max(n, n = 100) |>
  pull(industries_name)
##   [1] "Hospitals and Health Care"                     
##   [2] "Retail"                                        
##   [3] "IT Services and IT Consulting"                 
##   [4] "Staffing and Recruiting"                       
##   [5] "Financial Services"                            
##   [6] "Software Development"                          
##   [7] "Manufacturing"                                 
##   [8] "Construction"                                  
##   [9] "Banking"                                       
##  [10] "Technology"                                    
##  [11] "Insurance"                                     
##  [12] "Hospitality"                                   
##  [13] "Pharmaceutical Manufacturing"                  
##  [14] "Real Estate"                                   
##  [15] "Telecommunications"                            
##  [16] "Non-profit Organizations"                      
##  [17] "Industrial Machinery Manufacturing"            
##  [18] "Biotechnology Research"                        
##  [19] "Motor Vehicle Manufacturing"                   
##  [20] "Food and Beverage Services"                    
##  [21] "Advertising Services"                          
##  [22] "Business Consulting and Services"              
##  [23] "Accounting"                                    
##  [24] "Higher Education"                              
##  [25] "Medical Equipment Manufacturing"               
##  [26] "Government Administration"                     
##  [27] "Information and Internet"                      
##  [28] "Wellness and Fitness Services"                 
##  [29] "Defense and Space Manufacturing"               
##  [30] NA                                              
##  [31] "Logistics"                                     
##  [32] "Supply Chain and Storage"                      
##  [33] "Transportation"                                
##  [34] "Food and Beverage Manufacturing"               
##  [35] "Retail Apparel and Fashion"                    
##  [36] "Civil Engineering"                             
##  [37] "Electrical"                                    
##  [38] "Appliances"                                    
##  [39] "and Electronics Manufacturing"                 
##  [40] "Medical Practices"                             
##  [41] "Information Services"                          
##  [42] "Oil and Gas"                                   
##  [43] "Information and Media"                         
##  [44] "Environmental Services"                        
##  [45] "Law Practice"                                  
##  [46] "Utilities"                                     
##  [47] "Aviation and Aerospace Component Manufacturing"
##  [48] "Chemical Manufacturing"                        
##  [49] "Restaurants"                                   
##  [50] "Wholesale Building Materials"                  
##  [51] "Education Administration Programs"             
##  [52] "Truck Transportation"                          
##  [53] "Computer and Network Security"                 
##  [54] "Entertainment Providers"                       
##  [55] "Engineering Services"                          
##  [56] "Investment Management"                         
##  [57] "Legal Services"                                
##  [58] "Consumer Services"                             
##  [59] "Mental Health Care"                            
##  [60] "Human Resources Services"                      
##  [61] "Research Services"                             
##  [62] "Facilities Services"                           
##  [63] "Information Technology & Services"             
##  [64] "Investment Banking"                            
##  [65] "Civic and Social Organizations"                
##  [66] "Machinery Manufacturing"                       
##  [67] "Primary and Secondary Education"               
##  [68] "Wholesale"                                     
##  [69] "Airlines and Aviation"                         
##  [70] "Packaging and Containers Manufacturing"        
##  [71] "Architecture and Planning"                     
##  [72] "Semiconductor Manufacturing"                   
##  [73] "Automation Machinery Manufacturing"            
##  [74] "Marketing Services"                            
##  [75] "Security and Investigations"                   
##  [76] "Individual and Family Services"                
##  [77] "Professional Services"                         
##  [78] "Veterinary Services"                           
##  [79] "Design Services"                               
##  [80] "Computer Hardware Manufacturing"               
##  [81] "Public Safety"                                 
##  [82] "Services for Renewable Energy"                 
##  [83] "Broadcast Media Production and Distribution"   
##  [84] "Public Relations and Communications Services"  
##  [85] "Travel Arrangements"                           
##  [86] "Health and Human Services"                     
##  [87] "Personal Care Product Manufacturing"           
##  [88] "Retail Office Equipment"                       
##  [89] "Freight and Package Transportation"            
##  [90] "Business Content"                              
##  [91] "Computers and Electronics Manufacturing"       
##  [92] "Venture Capital and Private Equity Principals" 
##  [93] "Renewable Energy Semiconductor Manufacturing"  
##  [94] "Wireless Services"                             
##  [95] "E-Learning Providers"                          
##  [96] "Retail Luxury Goods and Jewelry"               
##  [97] "Education"                                     
##  [98] "Retail Groceries"                              
##  [99] "Mining"                                        
## [100] "Warehousing and Storage"
#posting_clean_long_top_3 <- posting_clean_long_yearmonth %>% 
#  filter(industries_name %in% top3_industries) %>% 
#  count(yearmonth, industries_name) 
#ggplot(posting_clean_long_top_3, aes(x = yearmonth, y = n, color = industries_name)) +
#  geom_line() +
#  scale_y_log10()

Total jobs posted on LinkedIn

# Total postings per month, all industries combined
# (equivalent to count(yearmonth))
posting_clean_long_yearmonth_total <- posting_clean_long_yearmonth |>
  group_by(yearmonth) |>
  summarise(n = n(), .groups = "drop")

Distribution of job postings by month

# Monthly posting volume; log y axis because a few months dominate.
# geom_bar() counts rows per x value by default, so mapping
# y = after_stat(count) explicitly is unnecessary.
ggplot(posting_clean_long_yearmonth, aes(x = yearmonth)) +
  geom_bar() +
  scale_y_log10(labels = scales::comma)

For our dataset, most job postings happen in March–April; we will use this knowledge to construct a choropleth map based on all of the jobs posted during these periods.

library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(ggthemes)

#posting_clean_long$state <- trimws(posting_clean_long$state)
#head(match(posting_clean_long$state, state.abb))

#posting_clean_long$state_full <- tolower(state.name[match(posting_clean_long$state, state.abb)])

# Normalize state abbreviations (strip stray whitespace) before matching
# against R's built-in state.abb lookup table.
posting_clean$state <- trimws(posting_clean$state)
# Sanity check: positions in state.abb for the first few rows
head(match(posting_clean$state, state.abb))
## [1] 30  6 35 32 15 33
# Map two-letter abbreviations to lower-case full state names, matching
# the lower-case region column produced by map_data("state") below.
posting_clean <- posting_clean %>%
  mutate(state_full = tolower(state.name[match(state, state.abb)]))

# Polygon outlines for the lower-48 states (maps package)
us_map_data <- map_data("state")

# Postings per state, dropping rows whose state could not be resolved
state_posting <- posting_clean |>
  filter(!is.na(state_full)) |>
  count(state_full)

#sum(state_posting$n)

# Attach posting counts to each state polygon, then draw a diverging
# choropleth centred on the median state count.
state_posting_map_joined <- us_map_data %>%
  left_join(state_posting, by = c("region" = "state_full"))
median_state_n <- median(state_posting_map_joined$n, na.rm = TRUE)
ggplot(state_posting_map_joined, aes(x = long, y = lat, group = group, fill = n)) +
  geom_polygon(color = "white") +
  coord_fixed(1.3) +
  theme_map() +
  scale_fill_gradient2(
    low = "red",
    mid = "grey",
    high = "forestgreen",
    midpoint = median_state_n
  )

Let's see the distribution of the number of jobs across states first.

# Postings per state as horizontal bars: once on a log10 scale, once raw.
# geom_col() is equivalent to geom_bar(stat = "identity").
state_bar <- ggplot(state_posting, aes(x = state_full, y = n)) +
  geom_col() +
  coord_flip()
state_bar + scale_y_log10()

state_bar

# Five-number summary of postings per state; the mean (~2040) sits well
# above the median (~1126), confirming the right skew seen in the bars.
summary(state_posting$n)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   118.0   431.5  1126.0  2039.7  2781.2 11386.0

Let’s try another map with the log10() used

# Same choropleth on log10 counts, which compresses the skew seen above.
state_posting_logn <- state_posting |>
  mutate(logn = log10(n))
state_posting_logn_map_joined <- us_map_data |>
  left_join(state_posting_logn, by = c("region" = "state_full"))
mid_logn <- mean(state_posting_logn_map_joined$logn, na.rm = TRUE)
ggplot(state_posting_logn_map_joined, aes(x = long, y = lat, group = group, fill = logn)) +
  geom_polygon(color = "white") +
  coord_fixed(1.3) +
  theme_map() +
  scale_fill_gradient2(
    low = "firebrick",
    mid = "grey90",
    high = "darkgreen",
    midpoint = mid_logn
  ) +
  theme(legend.position = "none")

Let's see the distribution of jobs in the tech industries. Because a job can be in multiple industries, and we are using the long-format data, I will drop any job postings duplicated on job_id, because they are most likely in the same industries anyway.

# Industry labels treated as "tech" for the tech-specific maps below
tech_industries <- c(
  "IT Services and IT Consulting",
  "Software Development",
  "Technology",
  "Information and Internet",
  "Information and Media",
  "Information Services",
  "Computer and Network Security",
  "Information Technology & Services"
)
# Re-list the 100 most common industry labels for reference
posting_clean_long |>
  count(industries_name, sort = TRUE) |>
  slice_max(n, n = 100) |>
  pull(industries_name)
##   [1] "Hospitals and Health Care"                     
##   [2] "Retail"                                        
##   [3] "IT Services and IT Consulting"                 
##   [4] "Staffing and Recruiting"                       
##   [5] "Financial Services"                            
##   [6] "Software Development"                          
##   [7] "Manufacturing"                                 
##   [8] "Construction"                                  
##   [9] "Banking"                                       
##  [10] "Technology"                                    
##  [11] "Insurance"                                     
##  [12] "Hospitality"                                   
##  [13] "Pharmaceutical Manufacturing"                  
##  [14] "Real Estate"                                   
##  [15] "Telecommunications"                            
##  [16] "Non-profit Organizations"                      
##  [17] "Industrial Machinery Manufacturing"            
##  [18] "Biotechnology Research"                        
##  [19] "Motor Vehicle Manufacturing"                   
##  [20] "Food and Beverage Services"                    
##  [21] "Advertising Services"                          
##  [22] "Business Consulting and Services"              
##  [23] "Accounting"                                    
##  [24] "Higher Education"                              
##  [25] "Medical Equipment Manufacturing"               
##  [26] "Government Administration"                     
##  [27] "Information and Internet"                      
##  [28] "Wellness and Fitness Services"                 
##  [29] "Defense and Space Manufacturing"               
##  [30] NA                                              
##  [31] "Logistics"                                     
##  [32] "Supply Chain and Storage"                      
##  [33] "Transportation"                                
##  [34] "Food and Beverage Manufacturing"               
##  [35] "Retail Apparel and Fashion"                    
##  [36] "Civil Engineering"                             
##  [37] "Electrical"                                    
##  [38] "Appliances"                                    
##  [39] "and Electronics Manufacturing"                 
##  [40] "Medical Practices"                             
##  [41] "Information Services"                          
##  [42] "Oil and Gas"                                   
##  [43] "Information and Media"                         
##  [44] "Environmental Services"                        
##  [45] "Law Practice"                                  
##  [46] "Utilities"                                     
##  [47] "Aviation and Aerospace Component Manufacturing"
##  [48] "Chemical Manufacturing"                        
##  [49] "Restaurants"                                   
##  [50] "Wholesale Building Materials"                  
##  [51] "Education Administration Programs"             
##  [52] "Truck Transportation"                          
##  [53] "Computer and Network Security"                 
##  [54] "Entertainment Providers"                       
##  [55] "Engineering Services"                          
##  [56] "Investment Management"                         
##  [57] "Legal Services"                                
##  [58] "Consumer Services"                             
##  [59] "Mental Health Care"                            
##  [60] "Human Resources Services"                      
##  [61] "Research Services"                             
##  [62] "Facilities Services"                           
##  [63] "Information Technology & Services"             
##  [64] "Investment Banking"                            
##  [65] "Civic and Social Organizations"                
##  [66] "Machinery Manufacturing"                       
##  [67] "Primary and Secondary Education"               
##  [68] "Wholesale"                                     
##  [69] "Airlines and Aviation"                         
##  [70] "Packaging and Containers Manufacturing"        
##  [71] "Architecture and Planning"                     
##  [72] "Semiconductor Manufacturing"                   
##  [73] "Automation Machinery Manufacturing"            
##  [74] "Marketing Services"                            
##  [75] "Security and Investigations"                   
##  [76] "Individual and Family Services"                
##  [77] "Professional Services"                         
##  [78] "Veterinary Services"                           
##  [79] "Design Services"                               
##  [80] "Computer Hardware Manufacturing"               
##  [81] "Public Safety"                                 
##  [82] "Services for Renewable Energy"                 
##  [83] "Broadcast Media Production and Distribution"   
##  [84] "Public Relations and Communications Services"  
##  [85] "Travel Arrangements"                           
##  [86] "Health and Human Services"                     
##  [87] "Personal Care Product Manufacturing"           
##  [88] "Retail Office Equipment"                       
##  [89] "Freight and Package Transportation"            
##  [90] "Business Content"                              
##  [91] "Computers and Electronics Manufacturing"       
##  [92] "Venture Capital and Private Equity Principals" 
##  [93] "Renewable Energy Semiconductor Manufacturing"  
##  [94] "Wireless Services"                             
##  [95] "E-Learning Providers"                          
##  [96] "Retail Luxury Goods and Jewelry"               
##  [97] "Education"                                     
##  [98] "Retail Groceries"                              
##  [99] "Mining"                                        
## [100] "Warehousing and Storage"
# Peek at postings whose industry string contains "Technology" — note this
# substring match also catches broader labels such as
# "Information Technology & Services".
posting_clean[grepl("Technology", posting_clean$industries_name, ignore.case = TRUE), ]
## # A tibble: 5,327 × 24
##        job_id company_name title industries_name norm_max_salary norm_med_salary
##         <dbl> <chr>        <chr> <chr>                     <dbl>           <dbl>
##  1  175485704 GOYT         Soft… Technology, In…              NA              NA
##  2 2269442456 navXcom      Comp… Space Research…              NA              NA
##  3 2558399667 I.T. Soluti… Vali… Pharmaceutical…          145600          135200
##  4 2974397965 Lynx Systems Mark… Information Te…           85000           80000
##  5 3736684097 Cira Tek, I… Prog… Information Te…              NA              NA
##  6 3739478140 Merakris Th… Dist… Biotechnology …              NA              NA
##  7 3811513595 Cage Riot    Vide… Technology, In…              NA           41600
##  8 3813645405 Tenazx Inc   Data… Technology, In…              NA              NA
##  9 3829184899 SanLuna LLC  Soci… Technology, In…              NA              NA
## 10 3835806869 vMOX         Head… Telecommunicat…          200000          175000
## # ℹ 5,317 more rows
## # ℹ 18 more variables: norm_min_salary <dbl>, pay_period <chr>, city <chr>,
## #   state <chr>, zip_code <chr>, fips <chr>, company_id <dbl>,
## #   formatted_work_type <chr>, original_listed_time <dttm>, expiry <dttm>,
## #   closed_time <dttm>, applies <dbl>, remote_allowed <dbl>,
## #   application_type <chr>, formatted_experience_level <chr>, sponsored <dbl>,
## #   work_type <chr>, state_full <chr>

Filtering for job postings within the tech industries

# Keep only rows whose industry is one of the tech labels defined above
posting_clean_long_tech <- posting_clean_long |>
  filter(industries_name %in% tech_industries)

# A job in several tech industries appears once per industry; count extras
sum(duplicated(posting_clean_long_tech$job_id))
## [1] 4924
# Keep the first row per job_id; distinct(.keep_all = TRUE) retains all
# columns and is equivalent to filter(!duplicated(job_id)).
posting_clean_long_tech_unique_id <- posting_clean_long_tech |>
  distinct(job_id, .keep_all = TRUE)

# Verify: no duplicated ids remain
sum(duplicated(posting_clean_long_tech_unique_id$job_id))
## [1] 0

Cleaning the new dataset

# Resolve state abbreviations to lower-case full names, as was done for
# the full dataset earlier, then count tech postings per state.
posting_clean_long_tech_unique_id <- posting_clean_long_tech_unique_id |>
  mutate(
    state = trimws(state),
    state_full = tolower(state.name[match(state, state.abb)])
  )

posting_clean_agg_tech_unique_id <- posting_clean_long_tech_unique_id |>
  filter(!is.na(state_full)) |>
  count(state_full)

# Join tech posting counts onto the state polygons and map log10 counts
# with a diverging palette centred on the mean log count.
state_tech_posting_map_joined <- us_map_data |>
  left_join(posting_clean_agg_tech_unique_id, by = c("region" = "state_full")) |>
  mutate(logn = log10(n))
ggplot(state_tech_posting_map_joined, aes(x = long, y = lat, group = group, fill = logn)) +
  geom_polygon(color = "white") +
  coord_fixed(1.3) +
  theme_map() +
  scale_fill_gradient2(
    low = "firebrick",
    mid = "grey90",
    high = "darkgreen",
    midpoint = mean(state_tech_posting_map_joined$logn, na.rm = TRUE)
  ) +
  theme(legend.position = "none")

# Variant 1: sequential grey → green fill on the same log counts
ggplot(state_tech_posting_map_joined, aes(x = long, y = lat, group = group, fill = logn)) +
  geom_polygon(color = "white") +
  coord_fixed(1.3) +
  theme_map() +
  scale_fill_gradient(
    low = "grey",
    high = "darkgreen"
  ) +
  theme(legend.position = "none")

# Variant 2: diverging grey → green with a muted mid colour at the mean
ggplot(state_tech_posting_map_joined, aes(x = long, y = lat, group = group, fill = logn)) +
  geom_polygon(color = "white") +
  coord_fixed(1.3) +
  theme_map() +
  scale_fill_gradient2(
    low = "grey",
    mid = "#97B197",
    high = "darkgreen",
    midpoint = mean(state_tech_posting_map_joined$logn, na.rm = TRUE)
  ) +
  theme(legend.position = "none")

Let’s load the data for graduates in the United States

library(readxl)
# Graduates per field of study per academic year — wide format, one column
# per year (schema inferred from the pivot below; verify against the file).
degree_dataset <- read_excel("degree_dataset.xlsx")
#View(degree_dataset)

Convert it to long format

# Wide → long: every column except the first ("Field of study") is a year.
# cols = -1 selects the same columns as 2:ncol(degree_dataset).
degree_long <- degree_dataset |>
  pivot_longer(
    cols = -1,
    names_to = "graduate_year",
    values_to = "total_graduate"
  )
# One line per field; years treated as discrete labels for now
ggplot(degree_long, aes(x = factor(graduate_year), y = total_graduate, group = `Field of study`)) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 30, vjust = 0.6))

Top 10 and highlight

# Total graduates per field across all years, ascending
field_totals <- degree_long |>
  group_by(`Field of study`) |>
  summarise(total_per_field = sum(total_graduate)) |>
  arrange(total_per_field)
# Names of the ten largest fields
degree_long_top10 <- field_totals |>
  slice_max(total_per_field, n = 10) |>
  pull(`Field of study`)

Because the graduate years are not on a continuous scale, we will have to convert them into normal years first.

# Convert academic-year labels to numeric years, e.g. "2019-20" -> 2020
# (assumes "YYYY-YY" labels — the "-..$" suffix is dropped and the ending
# year, start year + 1, is used). sub() suffices: the anchored pattern
# can match at most once, so it equals the original gsub().
degree_long <- degree_long |>
  mutate(
    graduate_year = as.numeric(sub("-..$", "", graduate_year)) + 1
  )

Growing Fields

# Trend lines for the top-10 degree fields. Two helper columns drive the
# styling: `highlight` (alpha: emphasised vs faded) and `highlight2`
# (short display name for the colour legend; everything else is "Others").
degree_long %>% 
  filter(`Field of study` %in% degree_long_top10) %>% 
  mutate(highlight = ifelse(`Field of study` %in% c("Business", "Health professions and related programs", "Engineering", "Biological and biomedical sciences", "Computer and information sciences and support services", "Psychology"), "highlight", "normal"),
         highlight2 = case_when(
           `Field of study` == "Business" ~ "Business",
           `Field of study` == "Health professions and related programs" ~ "Healthcare",
           `Field of study` == "Engineering" ~ "Engineering",
           `Field of study` == "Computer and information sciences and support services" ~ "Computer Science",
           `Field of study` == "Biological and biomedical sciences" ~ "Biomedical Science",
           `Field of study` == "Psychology" ~ "Psychology",
           TRUE ~ "Others"
         )) %>% 
  ggplot( aes(x = graduate_year, y = total_graduate, group = `Field of study`, color = highlight2, alpha = highlight)) +
  geom_line(linewidth = 0.8) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 30, vjust = 0.6)) +
  # Non-highlighted fields are faded rather than removed
  scale_alpha_manual(values = c("highlight" = 1, "normal" = 0.3)) +
  labs(x = "Years", color = "Field of Study", title = "Top Growing Degrees") +
  # NOTE(review): 7 colours vs 6 breaks — presumably the trailing "grey"
  # is meant for "Others"; verify the value-to-level mapping.
  scale_color_manual(values = c("#16317DFF" ,"#007E2FFF", "#00B7A7FF", "#B86092FF", "#A40000FF","#F39C12", "grey"),
                     breaks = c("Business", "Healthcare", "Biomedical Science", "Engineering", "Psychology", "Computer Science")) +
  theme(#legend.position = "none", 
        axis.title.y.left = element_blank(),
        plot.title = element_text(hjust = 0.5, size = 20)) +
  guides(alpha = "none") +
  # Manual y labels in "k" units matching the fixed breaks
  scale_y_continuous(breaks = seq(0, 400000, 100000),
                     labels = c("0", "100k", "200k", "300k", "400k")) +
  theme(panel.grid.major.y = element_line(),
        axis.line.y.left = element_blank())

# Same plot but over *all* fields (no top-10 filter) and with a stronger
# fade (alpha 0.2) on the non-highlighted lines.
degree_long %>% 
  mutate(highlight = ifelse(`Field of study` %in% c("Business", "Health professions and related programs", "Engineering", "Biological and biomedical sciences", "Computer and information sciences and support services", "Psychology"), "highlight", "normal"),
         highlight2 = case_when(
           `Field of study` == "Business" ~ "Business",
           `Field of study` == "Health professions and related programs" ~ "Healthcare",
           `Field of study` == "Engineering" ~ "Engineering",
           `Field of study` == "Computer and information sciences and support services" ~ "Computer Science",
           `Field of study` == "Biological and biomedical sciences" ~ "Biomedical Science",
           `Field of study` == "Psychology" ~ "Psychology",
           TRUE ~ "Others"
         )) %>% 
  ggplot( aes(x = graduate_year, y = total_graduate, group = `Field of study`, color = highlight2, alpha = highlight)) +
  geom_line(linewidth = 0.8) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 30, vjust = 0.6)) +
  scale_alpha_manual(values = c("highlight" = 1, "normal" = 0.2)) +
  labs(x = "Years", color = "Field of Study", title = "Top Growing Degrees") +
  # NOTE(review): same 7-colour / 6-break mapping caveat as the plot above
  scale_color_manual(values = c("#16317DFF" ,"#007E2FFF", "#00B7A7FF", "#B86092FF", "#A40000FF","#F39C12", "grey"),
                     breaks = c("Business", "Healthcare", "Biomedical Science", "Engineering", "Psychology", "Computer Science")) +
  theme(#legend.position = "none", 
        axis.title.y.left = element_blank(),
        plot.title = element_text(hjust = 0.5, size = 20)) +
  guides(alpha = "none")

# Daily Indeed job-postings index for the US — presumably the FRED
# IHLIDXUS series (Feb 1, 2020 = 100 per the plot caption below); verify.
ind_posting <- read_csv("Indeed_Postings.csv")
## Rows: 1827 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (1): IHLIDXUS
## date (1): observation_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ind_posting <- ind_posting %>% 
  rename(Value = IHLIDXUS)
# Indeed postings index over time, with the period up to 2023-05-11
# shaded and key dates hand-labelled along y = -5 (the real x-axis text
# is blanked out in the theme below).
ggplot(ind_posting, aes(x = observation_date, y = Value)) + 
  geom_line(linewidth = 1, color = "#5C90B2") +
  ylim(-5, NA) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y-%b") +
  theme_classic() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.line.y.left = element_blank(),
        panel.grid.major.y = element_line(),
        axis.line.x.bottom = element_blank(),
        axis.ticks.x.bottom = element_blank(),
        axis.text.x = element_blank(),
        plot.title = element_text(size = 20),
        plot.caption = element_text(color = adjustcolor("black", alpha.f = 0.5))) +
  # Baseline standing in for the hidden x axis
  geom_hline(yintercept = 0, color = "black") +
  # Shaded band from the start of the series through 2023-05-11
  annotate("rect", xmin = as.Date(min(ind_posting$observation_date), format = "%Y-%m-%d"), xmax = as.Date("2023-05-11", format = "%Y-%m-%d"), ymin = 0, ymax = max(ind_posting$Value) + 5, fill = "red", alpha = 0.08) +
  # NOTE(review): the hard-coded "2020-03-28" label presumably equals
  # min(observation_date) — verify if the input file changes.
  annotate("text", x = as.Date(min(ind_posting$observation_date), format = "%Y-%m-%d"), y = -5, label = "2020-03-28", size = 3.2, hjust = 0.4) +
  annotate("text", x = as.Date("2022-04-01", format = "%Y-%m-%d"),y = -5, label = "2022-04-01", size = 3.2) +
  # Dashed vertical marker at 2022-04-01
  annotate("segment", x = as.Date("2022-04-01", format = "%Y-%m-%d"), xend = as.Date("2022-04-01", format = "%Y-%m-%d"), y = 0, yend = max(ind_posting$Value), color = "black", linetype = "dashed") + 
  annotate("text", x = as.Date("2023-05-11", format = "%Y-%m-%d"), y = -5, label = "2023-05-11", size = 3.2) + 
  annotate("text", x = as.Date("2024-01-01", format = "%Y-%m-%d"), y = -5, label = "2024-01-01", size = 3.2) + 
  annotate("text", x = as.Date("2025-01-01", format = "%Y-%m-%d"), y = -5, label = "2025-01-01", size = 3.2) +
  labs(title = "Job Postings On Indeed in the United States", caption = "Units: Index, Feb, 1, 2020 = 100, Seasonally Adjusted, Frequency: Daily, 7-Day")

# NOTE(review): installing a package at render time is fragile and needs
# network access — consider moving this install outside the document.
devtools::install_github("UrbanInstitute/urbnmapr")
## WARNING: Rtools is required to build R packages, but is not currently installed.
## 
## Please download and install Rtools 4.4 from https://cran.r-project.org/bin/windows/Rtools/.
## Skipping install of 'urbnmapr' from a github remote, the SHA1 (ef9f4488) has not changed since last install.
##   Use `force = TRUE` to force installation
library(urbnmapr)
# State and county geometries as sf objects
states_sf <- get_urbn_map(map = "states", sf = TRUE)
counties_sf <- get_urbn_map(map = "counties", sf = TRUE)
# FIX: sf is not attached until later in the document (library(sf) comes
# after this chunk), so qualify st_crs() explicitly to avoid depending on
# load order. EPSG:4326 is plain lon/lat (WGS84).
ggplot(states_sf) +
  geom_sf() +
  coord_sf(crs = sf::st_crs(4326)) +
  theme_map()
# Postings per county FIPS code (rows without a FIPS are dropped)
posting_clean_fips_agrr <- posting_clean %>% 
  filter(!is.na(fips)) %>% 
  count(fips) 
posting_fips_join <- left_join(counties_sf, posting_clean_fips_agrr, by = c("county_fips" = "fips"))
## old-style crs object detected; please recreate object with a recent sf::st_crs()
## Warning in CPL_crs_from_input(x): GDAL Message 1: CRS EPSG:2163 is deprecated.
## Its non-deprecated replacement EPSG:9311 will be used instead. To use the
## original CRS, set the OSR_USE_NON_DEPRECATED configuration option to NO.
library(sf)
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
# County-level choropleth of raw posting counts from the main dataset
ggplot(posting_fips_join, aes(fill = n)) +
  geom_sf() +
  coord_sf(crs=st_crs(4326)) +
  theme_map()

# Larger LinkedIn postings dataset (~1.35M rows per the output below);
# used for a county-level map with better coverage than postings.csv
posting_large <- read_csv("linkedin_job_postings.csv")
## Rows: 1348454 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): job_link, job_title, company, job_location, search_city, search_co...
## lgl  (3): got_summary, got_ner, is_being_worked
## dttm (1): last_processed_time
## date (1): first_seen
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# City/state/county lookup table — presumably a SimpleMaps "uscities"
# export; supplies county_fips for joining postings to county geometries
uscities <- read_csv("uscities.csv")
## Rows: 31254 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): city, city_ascii, state_id, state_name, county_fips, county_name, s...
## dbl (6): lat, lng, population, density, ranking, id
## lgl (2): military, incorporated
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Restrict the large dataset to US postings only
posting_large_us <- posting_large %>% 
  filter(search_country == "United States")
nrow(posting_large_us)
## [1] 1149342
# Build a "City, ST" -> county FIPS lookup from the uscities table.
# NOTE: the same city/state label can map to more than one county, so the
# join below fans out (the row count grows from 1,149,342 to 1,150,097).
# Declaring the relationship makes that fan-out explicit instead of
# relying on dplyr's many-to-many warning.
uscities_clean <- uscities %>% 
  select(city, state_id, county_fips) %>% 
  mutate(location = paste(city, state_id, sep = ", ")) %>% 
  select(-city, -state_id)
posting_large_us_join <- posting_large_us %>% 
  left_join(uscities_clean, by = c("job_location" = "location"),
            relationship = "many-to-many")
## Warning in left_join(., uscities_clean, by = c(job_location = "location")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 835 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Row count after the join — slightly higher than before because of the
# many-to-many city matches
nrow(posting_large_us_join)
## [1] 1150097
# Postings per county, joined onto the county geometries; counties with
# no matched postings end up with n = NA.
posting_large_aggr <- posting_large_us_join |>
  filter(!is.na(county_fips)) |>
  count(county_fips)
posting_large_fips_join <- counties_sf |>
  left_join(posting_large_aggr, by = "county_fips")
## old-style crs object detected; please recreate object with a recent sf::st_crs()
# County-level choropleth of the large dataset (log counts), with state
# borders drawn on top.
ggplot() +
  geom_sf(data = posting_large_fips_join, aes(fill = log(n)), linewidth = 0.1) +
  coord_sf(crs=st_crs(4326)) +
  theme_map() +
  # NOTE(review): numeric legend.position is deprecated in ggplot2 >= 3.5
  # (see the warning in the output) — consider legend.position.inside.
  theme(legend.position = c(0.2,-0.15),
        legend.title = element_blank(),
        plot.margin = margin(1,1,50,1)) +
  scale_fill_gradientn(
    colors = c("lightblue","#FFFF99", "orange", "darkred"),  
    na.value = "lightblue",
    guide = guide_colorbar(barheight = 0.8, barwidth = 20, direction = "horizontal"),
    limits = c(min(log(posting_large_fips_join$n), na.rm = TRUE), 
               max(log(posting_large_fips_join$n), na.rm = TRUE)),
    # NOTE(review): these 5 hard-coded labels must match the number of
    # breaks generated for the limits above — fragile if the data change.
    labels = c("0",expression(10^2.5),expression(10^5),expression(10^7.5),expression(10^10))
  ) +
  geom_sf(data = states_sf, color = "black", fill = NA, linewidth = 0.25) 
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
## old-style crs object detected; please recreate object with a recent sf::st_crs()

posting_large_fips_join_logn <- posting_large_fips_join %>% 
  mutate(logn = log(n))
# Diverging blue palette on log counts.
# FIX: mean() needs na.rm = TRUE here — counties with no matched postings
# have logn = NA, so without it the midpoint evaluates to NA and the
# gradient cannot be positioned (every other midpoint in this document
# already passes na.rm = TRUE).
ggplot(posting_large_fips_join_logn, aes(fill = logn)) +
  geom_sf() +
  coord_sf(crs = st_crs(4326)) +
  theme_map() +
  theme(legend.position = "right") +
  scale_fill_gradient2(
    low = "lightblue",
    mid = "royalblue",
    high = "darkblue", 
    midpoint = mean(posting_large_fips_join_logn$logn, na.rm = TRUE),
    na.value = "lightblue"
  )


#guide = guide_colorbar(barheight = 10, barwidth = 1)

Dumbbell graph

library(ggalt)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
# Median of the normalized min/med/max salary per industry and experience
# level. `.groups = "drop"` returns a plain ungrouped tibble (and silences
# the "grouped output" message the original emitted); previously the result
# stayed grouped by industries_name, which later verbs could silently
# operate within.
salary_aggr_by_lv <- posting_clean_long %>% 
  filter(!is.na(norm_min_salary) & !is.na(norm_med_salary) & !is.na(norm_max_salary)) %>% 
  filter(!is.na(industries_name) & !is.na(formatted_experience_level)) %>% 
  group_by(industries_name, formatted_experience_level) %>% 
  summarise(med_min = median(norm_min_salary),
            med_med = median(norm_med_salary),
            med_max = median(norm_max_salary),
            .groups = "drop")
# Per-industry medians of the normalized salary bounds. Rows with any
# missing salary figure or industry name are dropped before grouping.
salary_aggr_by_industry <- posting_clean_long %>%
  filter(!is.na(industries_name),
         !is.na(norm_min_salary),
         !is.na(norm_med_salary),
         !is.na(norm_max_salary)) %>%
  group_by(industries_name) %>%
  summarise(med_min = median(norm_min_salary),
            med_med = median(norm_med_salary),
            med_max = median(norm_max_salary))

Because we want a reasonably accurate baseline, we will remove all industries mentioned fewer than 20 times, so that each industry's median pay is reasonably reflective of its true central tendency.

# Industries mentioned at most 20 times: their medians are too noisy to
# serve as a baseline, so they are excluded from the rankings below.
unique_industry <- posting_clean_long %>%
  count(industries_name) %>%
  filter(n <= 20) %>%
  arrange(n) %>%
  pull(industries_name)
# Ten highest-paying industries by median-of-medians, rare ones excluded.
top10_pay_industry <- salary_aggr_by_industry %>%
  filter(!(industries_name %in% unique_industry)) %>%
  slice_max(med_med, n = 10) %>%
  pull(industries_name)

Top 10 best paying industries

# Dumbbell chart of the top-10 industries: min-to-max salary span with a
# point marking the median, rows ordered by median pay.
salary_aggr_by_industry %>%
  filter(industries_name %in% top10_pay_industry) %>%
  mutate(industry_ord = reorder(industries_name, med_med)) %>%
  ggplot(aes(y = industry_ord)) +
  geom_dumbbell(aes(x = med_min, xend = med_max)) +
  geom_point(aes(x = med_med))
## Warning: Using the `size` aesthetic with geom_segment was deprecated in ggplot2 3.4.0.
## ℹ Please use the `linewidth` aesthetic instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Repeat the comparison with a wider selection (the top 20); the lowest-paying industries are examined further below.

# Widen the selection to the twenty best-paying industries.
top20_pay_industry <- salary_aggr_by_industry %>%
  filter(!(industries_name %in% unique_industry)) %>%
  slice_max(med_med, n = 20) %>%
  pull(industries_name)

# Same dumbbell layout as the top-10 version, rows ordered by median pay.
salary_aggr_by_industry %>%
  filter(industries_name %in% top20_pay_industry) %>%
  mutate(industry_ord = reorder(industries_name, med_med)) %>%
  ggplot(aes(y = industry_ord)) +
  geom_dumbbell(aes(x = med_min, xend = med_max)) +
  geom_point(aes(x = med_med))

# Polished top-20 chart. "Movies" and "and Sound" are fragments created by
# splitting a comma-separated industry name, so they are dropped and the
# surviving "Videos" fragment is relabeled back to the full name on the axis.
salary_aggr_by_industry %>%
  filter(industries_name %in% top20_pay_industry,
         !industries_name %in% c("Movies", "and Sound")) %>%
  mutate(industry_ord = reorder(industries_name, med_med)) %>%
  ggplot(aes(y = industry_ord)) +
  # Grey band from the median minimum to the median maximum salary.
  geom_segment(aes(x = med_min, xend = med_max, yend = industry_ord),
               color = "grey",
               alpha = 0.8,
               linewidth = 4.5) +
  geom_point(aes(x = med_min), color = "red", size = 4.5) +
  geom_point(aes(x = med_max), color = "blue", size = 4.5) +
  theme_classic() +
  theme(panel.grid.major.x = element_line(),
        axis.line.y.left = element_blank(),
        axis.ticks.y.left = element_blank(),
        axis.ticks.x.bottom = element_blank(),
        axis.line.x.bottom = element_blank(),
        axis.text.y.left = element_text(size = 12)) +
  scale_x_continuous(limits = c(100000, 300000),
                     breaks = seq(100000, 300000, 50000)) +
  scale_y_discrete(labels = c("Videos" = "Videos, Movies, and Sounds"))

It is not worth breaking the results down by experience level, because the levels are not accurately labeled and do not reflect reality.

# Mirror-image of the top rankings: the ten lowest-paying industries.
bottom10_pay_industry <- salary_aggr_by_industry %>%
  filter(!(industries_name %in% unique_industry)) %>%
  slice_min(med_med, n = 10) %>%
  pull(industries_name)
salary_aggr_by_industry %>%
  filter(industries_name %in% bottom10_pay_industry) %>%
  mutate(industry_ord = reorder(industries_name, med_med)) %>%
  ggplot(aes(y = industry_ord)) +
  geom_dumbbell(aes(x = med_min, xend = med_max)) +
  geom_point(aes(x = med_med))

# And the twenty lowest-paying industries.
bottom20_pay_industry <- salary_aggr_by_industry %>%
  filter(!(industries_name %in% unique_industry)) %>%
  slice_min(med_med, n = 20) %>%
  pull(industries_name)

salary_aggr_by_industry %>%
  filter(industries_name %in% bottom20_pay_industry) %>%
  mutate(industry_ord = reorder(industries_name, med_med)) %>%
  ggplot(aes(y = industry_ord)) +
  geom_dumbbell(aes(x = med_min, xend = med_max)) +
  geom_point(aes(x = med_med))

# Scratch check: keep only postings paid on an hourly basis.
testing123 <- filter(posting, pay_period == "HOURLY")

Re-clean posting on pay period

# Scratch check: a single-industry slice of the cleaned long table.
testing321 <- filter(posting_clean_long, industries_name == "Hospitals and Health Care")
# FRED series CGBD2534 — presumably the unemployment rate for recent
# college graduates (matches the app title below) — TODO confirm the
# exact series definition.
grad_unemp <- read_csv("CGBD2534.csv")
## Rows: 303 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (1): CGBD2534
## date (1): observation_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Standardize to a shared date/value schema so this series can be used
# interchangeably with the general-population series loaded later.
grad_unemp <- grad_unemp %>% 
  rename(date = observation_date,
         value = CGBD2534)
library(shiny)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Shaded windows for the time-series plot — presumably the 2001 dot-com
# bust, the 2008-09 financial crisis, and the 2020 COVID shock (confirm
# against NBER recession dates). Each spans the full 0-11 y-range.
recession_window <- function(from, to) {
  data.frame(xmin = as.Date(from),
             xmax = as.Date(to),
             ymin = 0,
             ymax = 11)
}

highlight1 <- recession_window("2001-03-01", "2001-10-01")

highlight2 <- recession_window("2008-01-01", "2009-05-01")

highlight3 <- recession_window("2020-02-01", "2020-05-01")

# Page layout: a full-width start-year slider stacked above the plot.
# The slider runs from the first year of the data to the current year.
grad_min_year <- as.numeric(format(min(grad_unemp$date), "%Y"))
grad_max_year <- as.numeric(format(Sys.Date(), "%Y"))

ui <- fluidPage(
  titlePanel("College Graduate Unemployment Percentage"),
  fluidRow(
    column(11,
           sliderInput("start_year",
                       "Select Start Year:",
                       min = grad_min_year,
                       max = grad_max_year,
                       value = grad_min_year,
                       sep = "",
                       width = '100%'))
  ),
  fluidRow(
    column(11,
           plotlyOutput("linePlot"))
  )
)

server <- function(input, output) {
  output$linePlot <- renderPlotly({
    # Translate the slider year into a cut-off date (Jan 1 of that year).
    start_date <- as.Date(paste0(input$start_year, "-01-01"))
    filtered_data <- grad_unemp %>%
      filter(date >= start_date)

    p <- ggplot() +
      geom_line(data = filtered_data, aes(x = date, y = value), color = "#5C90B2") +
      labs(title = paste("Data from", input$start_year, "to Present"),
           x = "Date",
           y = "Unemployment Rate %") +
      theme_bw() +
      theme(axis.line.y.left = element_blank(),
            axis.ticks = element_blank(),
            panel.grid.major.x = element_blank()) +
      scale_y_continuous(limits = c(0, NA), breaks = seq(0, 10, 1))

    # Overlay each recession window that is still (partly) visible after
    # the cut-off. The original repeated this filter/if/geom_rect block
    # three times verbatim; a single loop keeps the three windows in sync.
    for (hl in list(highlight1, highlight2, highlight3)) {
      hl_visible <- hl %>% filter(xmax >= start_date)
      if (nrow(hl_visible) > 0) {
        p <- p + geom_rect(data = hl_visible,
                           aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
                           fill = "grey", alpha = 0.5, inherit.aes = FALSE)
      }
    }

    ggplotly(p)  
  })
}

shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents
# Quick static looks at the graduate series: since 2019, then since 2022.
grad_unemp %>%
  filter(date >= as.Date("2019-01-01")) %>%
  ggplot(aes(x = date, y = value)) +
  geom_line()

grad_unemp %>%
  filter(date >= as.Date("2022-01-01")) %>%
  ggplot(aes(x = date, y = value)) +
  geom_line()

# Per-industry salary medians again, plus how many postings back each one.
salary_aggr_by_industry_count <- posting_clean_long %>%
  filter(!is.na(industries_name),
         !is.na(norm_min_salary),
         !is.na(norm_med_salary),
         !is.na(norm_max_salary)) %>%
  group_by(industries_name) %>%
  summarise(med_min = median(norm_min_salary),
            med_med = median(norm_med_salary),
            med_max = median(norm_max_salary),
            count = n())
# Median salary vs. posting count, both on log axes; one point per industry.
# (stat = "identity" was dropped — it is already geom_point's default.)
salary_aggr_by_industry_count %>%
  filter(med_med > 10000, count >= 10) %>%
  ggplot(aes(x = med_med, y = count, color = industries_name)) +
  geom_point() +
  theme(legend.position = "none") +
  scale_x_log10() +
  scale_y_log10()

# Same scatter with the axes swapped: posting count vs. median salary.
salary_aggr_by_industry_count %>%
  filter(med_med > 10000, count >= 10) %>%
  ggplot(aes(x = count, y = med_med, color = industries_name)) +
  geom_point() +
  theme(legend.position = "none") +
  scale_x_log10() +
  scale_y_log10()

# Interactive version of the scatter: hovering a point shows the industry.
industry_scatter_df <- salary_aggr_by_industry_count %>%
  filter(med_med > 10000, count >= 10)
plot_ly(
  industry_scatter_df,
  x = ~med_med,
  y = ~count,
  type = "scatter",
  mode = "markers",
  text = ~industries_name,
  marker = list(size = 10, opacity = 0.7, color = 'steelblue')
) %>%
  layout(
    xaxis = list(
      type = "log",
      title = "Median Salary",
      tickvals = c(10000, 30000, 100000, 300000, 1000000),
      ticktext = c("10K", "30K", "100K", "300K", "1M")
    ),
    yaxis = list(
      type = "log",
      title = "Count",
      tickvals = c(10, 30, 100, 300, 1000, 3000),
      ticktext = c("10", "30", "100", "300", "1K", "3K")
    ),
    showlegend = FALSE
  )

Best Graph

# Count vs. median-salary scatter on log axes. Fix: the salary tick labels
# used lowercase "150k"/"200k" while the identical axis in the Shiny app
# below uses "150K"/"200K" — labels are now capitalized consistently.
salary_aggr_by_industry_count %>%
  filter(med_med > 10000 & count >= 10) %>%
  plot_ly(
    x = ~count,
    y = ~med_med,
    type = "scatter",
    mode = "markers",
    text = ~industries_name,
    marker = list(size = 10, opacity = 0.7, color = 'steelblue')
  ) %>%
  layout(
    xaxis = list(
      type = "log",
      title = "Count",
      tickvals = c(10, 30, 100, 300, 1000, 3000),
      ticktext = c("10", "30", "100", "300", "1K", "3K")
    ),
    yaxis = list(
      type = "log",
      title = "Median Salary",
      tickvals = c(50000, 70000, 100000, 150000, 200000),
      ticktext = c("50K", "70K", "100K", "150K", "200K")
    ),
    showlegend = FALSE
  )
# Same chart on untransformed (linear) axes for comparison.
linear_scatter_df <- salary_aggr_by_industry_count %>%
  filter(med_med > 10000, count >= 10)
plot_ly(
  linear_scatter_df,
  x = ~count,
  y = ~med_med,
  type = "scatter",
  mode = "markers",
  text = ~industries_name,
  marker = list(size = 10, opacity = 0.7, color = 'steelblue')
) %>%
  layout(
    xaxis = list(title = "Count"),
    yaxis = list(title = "Median Salary"),
    showlegend = FALSE
  )

Best graph for visualizing salary and demand

library(shiny)
library(plotly)
library(dplyr)


# Industries with a meaningful median (> 10K) and at least 10 postings.
filtered_industry_posting <- salary_aggr_by_industry_count %>%
  filter(med_med > 10000, count >= 10)

# Layout: a full-width industry search box above the scatter plot.
ui <- fluidPage(
  titlePanel("Industry Salary vs Count (Log Scale)"),
  fluidRow(
    column(12, textInput("industry_search", "Search for an Industry:", ""))
  ),
  fluidRow(
    column(12, plotlyOutput("plot"))
  )
)

server <- function(input, output) {

  output$plot <- renderPlotly({

    search_term <- trimws(input$industry_search)

    plot_data <- filtered_industry_posting %>%
      mutate(
        # Highlight only when the user typed something: previously an
        # empty search box fed grepl(""), which matches every row and
        # painted the whole plot red on startup.
        # NOTE(review): the term is still treated as a regex — consider
        # fixed matching if users type literal punctuation.
        is_highlighted = nzchar(search_term) &
          grepl(search_term, industries_name, ignore.case = TRUE),
        point_color = ifelse(is_highlighted, "red", "steelblue"),
        point_size = ifelse(is_highlighted, 10, 7), 
        point_opacity = ifelse(is_highlighted, 1, 0.6)
      )

    plot_ly(
      data = plot_data,
      x = ~count,
      y = ~med_med,
      type = "scatter",
      mode = "markers",
      text = ~industries_name,
      marker = list(
        color = ~point_color,
        size = ~point_size,
        opacity = ~point_opacity,
        line = list(width = 1, color = "white")
      )
    ) %>%
      layout(
        xaxis = list(
          type = "log",
          title = "Count",
          tickvals = c(10, 30, 100, 300, 1000, 3000),
          ticktext = c("10", "30", "100", "300", "1K", "3K")
        ),
        yaxis = list(
          type = "log",
          title = "Median Salary",
          tickvals = c(50000, 70000, 100000, 150000, 200000),
          ticktext = c("50K", "70K", "100K", "150K", "200K")
        ),
        showlegend = FALSE
      )
  })
}

shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents

Alternatively, the same chart with the axes reversed:

library(shiny)
library(plotly)
library(dplyr)

# Same filter as the previous app: stable medians, at least 10 postings.
filtered_industry_posting <- salary_aggr_by_industry_count %>%
  filter(med_med > 10000, count >= 10)

# Layout for the axis-swapped variant: search box above the plot.
ui <- fluidPage(
  titlePanel("Expected Salary and Demand"),
  fluidRow(
    column(12, textInput("industry_search", "Search for an Industry:", ""))
  ),
  fluidRow(
    column(12, plotlyOutput("plot"))
  )
)

server <- function(input, output) {

  output$plot <- renderPlotly({

    search_term <- trimws(input$industry_search)

    plot_data <- filtered_industry_posting %>%
      mutate(
        # Highlight only when the user typed something: previously an
        # empty search box fed grepl(""), which matches every row and
        # painted the whole plot red on startup.
        # NOTE(review): the term is still treated as a regex — consider
        # fixed matching if users type literal punctuation.
        is_highlighted = nzchar(search_term) &
          grepl(search_term, industries_name, ignore.case = TRUE),
        point_color = ifelse(is_highlighted, "red", "steelblue"),
        point_size = ifelse(is_highlighted, 10, 7), 
        point_opacity = ifelse(is_highlighted, 1, 0.6)
      )

    plot_ly(
      data = plot_data,
      x = ~med_med,  
      y = ~count,    
      type = "scatter",
      mode = "markers",
      text = ~industries_name,
      marker = list(
        color = ~point_color,
        size = ~point_size,
        opacity = ~point_opacity,
        line = list(width = 1, color = "white")
      )
    ) %>%
      layout(
        xaxis = list(
          type = "log",
          title = "Median Salary",
          tickvals = c(50000, 70000, 100000, 150000, 200000),
          ticktext = c("50K", "70K", "100K", "150K", "200K")
        ),
        yaxis = list(
          type = "log",
          title = "Count",
          tickvals = c(10, 30, 100, 300, 1000, 3000),
          ticktext = c("10", "30", "100", "300", "1K", "3K")
        ),
        showlegend = FALSE
      )
  })
}

shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents
# FRED UNRATE: unemployment rate for the general US population.
us_unemp <- read_csv("UNRATE.csv")
## Rows: 927 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (1): UNRATE
## date (1): observation_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Harmonize to the shared date/value schema; as.numeric() is a
# belt-and-braces conversion (UNRATE already reads in as dbl).
us_unemp <- us_unemp %>%
  rename(date = observation_date, value = UNRATE) %>%
  mutate(value = as.numeric(value))
# Full-history static look, with the y-axis anchored at zero.
ggplot(us_unemp, aes(x = date, y = value)) +
  geom_line() +
  ylim(0, NA)

library(shiny)
library(ggplot2)
library(plotly)
library(dplyr)


# Side-by-side comparison app. Fix: the slider's lower bound now covers
# the EARLIEST date in EITHER series — UNRATE reaches back decades before
# the college-graduate series, and the old bound (grad_unemp only) made
# that early history unreachable. The default view (value) is unchanged.
earliest_year <- as.numeric(format(min(min(grad_unemp$date), min(us_unemp$date)), "%Y"))

ui <- fluidPage(
  titlePanel("U.S. Unemployment Rates Over Time"),
  fluidRow(
    column(12,
           sliderInput("start_year",
                       "Select Start Year:",
                       min = earliest_year,
                       max = as.numeric(format(Sys.Date(), "%Y")),
                       value = as.numeric(format(min(grad_unemp$date), "%Y")),
                       sep = "",
                       width = '100%'))),
  fluidRow(
    column(6,
           h4("College Graduates"),
           plotlyOutput("linePlot")),
    column(6,
           h4("US General Population"),
           plotlyOutput("linePlot2"))))


server <- function(input, output) {

  # Build one standardized unemployment line chart. Both panels share a
  # common y-limit so the two populations are visually comparable.
  unemployment_panel <- function(df, start_year, y_max) {
    ggplot(df, aes(x = date, y = value)) +
      geom_line(color = "#5C90B2") +
      labs(title = paste("Data from", start_year, "to Present"),
           x = "Date", y = "Unemployment Rate %") +
      scale_y_continuous(limits = c(0, y_max)) +
      theme_bw() +
      theme(panel.grid.major.x = element_blank())
  }

  # Shared reactive state: cut-off date, both filtered series, and the
  # common y-maximum. The original recomputed all of this — including
  # filtering BOTH datasets — separately inside each render block.
  panel_data <- reactive({
    start_date <- as.Date(paste0(input$start_year, "-01-01"))
    grad_filtered <- grad_unemp %>% filter(date >= start_date)
    us_filtered   <- us_unemp   %>% filter(date >= start_date)
    list(grad = grad_filtered,
         us = us_filtered,
         max_y = max(c(grad_filtered$value, us_filtered$value), na.rm = TRUE))
  })

  output$linePlot <- renderPlotly({
    d <- panel_data()
    ggplotly(unemployment_panel(d$grad, input$start_year, d$max_y))
  })

  output$linePlot2 <- renderPlotly({
    d <- panel_data()
    ggplotly(unemployment_panel(d$us, input$start_year, d$max_y))
  })
}


shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents